{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# movielens电影评分数据分析(上)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 从用户表读取用户信息\n",
    "users = pd.read_table('users.dat', header=None, names=['UserID','Gender','Age','Occupation','Zip-code'], sep='::',engine='python')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6040\n"
     ]
    }
   ],
   "source": [
    "# 打印列表长度，共有6040条记录\n",
    "print(len(users))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>UserID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Age</th>\n",
       "      <th>Occupation</th>\n",
       "      <th>Zip-code</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>F</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>48067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>M</td>\n",
       "      <td>56</td>\n",
       "      <td>16</td>\n",
       "      <td>70072</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>M</td>\n",
       "      <td>25</td>\n",
       "      <td>15</td>\n",
       "      <td>55117</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>M</td>\n",
       "      <td>45</td>\n",
       "      <td>7</td>\n",
       "      <td>02460</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>M</td>\n",
       "      <td>25</td>\n",
       "      <td>20</td>\n",
       "      <td>55455</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   UserID Gender  Age  Occupation Zip-code\n",
       "0       1      F    1          10    48067\n",
       "1       2      M   56          16    70072\n",
       "2       3      M   25          15    55117\n",
       "3       4      M   45           7    02460\n",
       "4       5      M   25          20    55455"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看前五条记录\n",
    "users.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000209\n",
      "   UserID  MovieID  Rating  Timestamp\n",
      "0       1     1193       5  978300760\n",
      "1       1      661       3  978302109\n",
      "2       1      914       3  978301968\n",
      "3       1     3408       4  978300275\n",
      "4       1     2355       5  978824291\n",
      "3883\n",
      "   MovieID                               Title                        Genres\n",
      "0        1                    Toy Story (1995)   Animation|Children's|Comedy\n",
      "1        2                      Jumanji (1995)  Adventure|Children's|Fantasy\n",
      "2        3             Grumpier Old Men (1995)                Comedy|Romance\n",
      "3        4            Waiting to Exhale (1995)                  Comedy|Drama\n",
      "4        5  Father of the Bride Part II (1995)                        Comedy\n"
     ]
    }
   ],
   "source": [
    "# 同样方法，导入电影评分表\n",
    "ratings = pd.read_table('ratings.dat', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], sep='::',engine='python')\n",
    "# 打印列表长度\n",
    "print(len(ratings))\n",
    "print(ratings.head(5))\n",
    "# 同样方法，导入电影数据表\n",
    "movies = pd.read_table('movies.dat', header=None, names=['MovieID', 'Title', 'Genres'], sep='::',engine='python')\n",
    "print(len(movies))\n",
    "print(movies.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 合并数据表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导入完成之后，我们可以发现这三张表类似于数据库中的表\n",
    "# 要进行数据分析，我们就要将多张表进行合并才有助于分析 先将users与ratings两张表合并再跟movied合并\n",
    "data = pd.merge(pd.merge(users, ratings), movies)\n",
    "data.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 对数据初步描述分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 合并后的每一条记录反映了每个人的年龄，职业，性别，邮编，电影ID，评分，时间戳，电影信息，电影分类等一系列信息\n",
    "# 比如我们查看用户id为12的所有信息\n",
    "data[data.UserID==12]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看每一部电影不同性别的平均评分并计算分歧差值，之后排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 查看每一部电影不同性别的平均评分 data_gender接收\n",
    "data_gender=data.pivot_table(index='Title',columns='Gender',values='Rating')\n",
    "data_gender.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 查看电影分歧最大的那部电影，在原数据中体现\n",
    "data_gender['diff']=np.abs(data_gender.F-data_gender.M)\n",
    "print(data_gender.shape)\n",
    "data_gender.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 男女电影分歧最大进行排序 data_gender_sorted接收\n",
    "data_gender_sorted=data_gender.sort_values(by='diff',ascending=False)\n",
    "data_gender_sorted.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 算出每部电影平均得分并对其进行排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#算出每部电影平均得分并对其进行排序 data_mean_rating 接收\n",
    "data_mean_rating=data.pivot_table(index='Title',values='Rating')\n",
    "data_mean_rating['size'] = data_mean_rating.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对电影平均得分排序\n",
    "data_mean_rating_sorted=data_mean_rating.sort_values(by='Rating',ascending=False)\n",
    "data_mean_rating_sorted.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看评分次数多的电影并进行排序 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 查看评分次数多的电影并进行排序   data_rating_num接收\n",
    "data_rating_num = pd.crosstab(data.Title, data.Rating)\n",
    "data_rating_num['count'] = np.sum(data_rating_num, axis=1)\n",
    "data_rating_num.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#进行排序\n",
    "data_rating_num_sorted = data_rating_num.sort_values(by='count',ascending=False)\n",
    "print(data_rating_num_sorted.shape)\n",
    "data_rating_num_sorted.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# movielens电影评分数据分析(下)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 过滤掉评分条目数不足250条的电影"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#过滤掉评分条目数不足250条的电影\n",
    "data_rating_num_sorted = data_rating_num_sorted[data_rating_num_sorted['count']>250]\n",
    "print(data_rating_num_sorted.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对评分数量进行排序，并取前20条数据\n",
    "data_rating_num_sorted.head(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 评分最高的十部电影\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#评分最高的十部电影\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 查看不同年龄的分布情况并且采用直方图进行可视化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 在原数据中标记出用户位于的年龄分组"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 电影评分表中计算不同类型电影的频数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#对数据进行规整-movies\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#删除level_1列，将columns为0的列重命名为genres,并重新定义数据框为movies_genres\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#将原movies数据中的genres列替换成movies_genres，得到规整化处理后的movies数据 \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#合并。构建电影评分数据集movie_ratings\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#计算movies_ratings中不同类型电影的频数\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
